#Data Handling
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale
#Visualization
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import display
import seaborn as sns
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
from plotly.graph_objs import *
#Feature extraction
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage
from sklearn.cluster.bicluster import SpectralBiclustering, SpectralCoclustering
#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import NMF
#Misc
from scipy.signal import argrelextrema
from math import floor
from os import listdir
metadata = pd.read_csv("sample_sheet_Patrick_AdiposeMacs.txt", sep="\t")
metadata["#id"] = metadata["#id"].astype(str)
metadata
#Import data from files
study_data = None
for fname in listdir("htseq_count/"):
exp_id_long = fname.split("_")[4].split(".")[0]
exp_id = exp_id_long.lstrip("0")
exp_data = pd.read_csv("htseq_count/"+fname, sep="\t", header=None, names=["gene", str(exp_id)])
if study_data is None:
study_data = exp_data
else:
study_data = study_data.merge(exp_data, on="gene", how="outer")
#Clean up
study_data.index = study_data["gene"].values
del study_data["gene"]
study_data = study_data.T
study_data = study_data.merge(metadata, left_index=True, right_on="#id")
study_data.index = study_data.apply(lambda x: x["#id"] + " " + x["series"], axis=1)
sample_readinfo = study_data.iloc[:,-12:-1]
study_data = study_data.iloc[:,:-12]
#Display dfs
display(study_data.head())
display(sample_readinfo.head())
#Sanity check
if study_data.isnull().sum().sum() != 0:
print("Some genes missing from count file")
count_matrix = minmax_scale(study_data.as_matrix(), axis=0)
heatmap = Data([
Heatmap(
x = study_data.columns,
y = study_data.index,
z = count_matrix,
colorscale='Viridis'
)
])
layout = Layout(
title='Gene Expression Heatmap',
yaxis=dict(type="category"),
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
p = sns.barplot(study_data.index, study_data.sum(axis=1).as_matrix())
p.set_xticklabels(study_data.index, rotation=90)
plt.title("Gene Count Intensity Per Experiment")
plt.show()
norm_count_matrix = study_data.as_matrix() / study_data.sum(axis=1)[:, None]
norm_count_matrix = norm_count_matrix[:, norm_count_matrix.sum(axis=0) > 0]
norm_count_matrix = minmax_scale(norm_count_matrix)
print(count_matrix.shape, norm_count_matrix.shape)
heatmap = Data([
Heatmap(
x = study_data.columns,
y = study_data.index,
z = norm_count_matrix,
colorscale='Viridis'
)
])
layout = Layout(
title='Gene Expression Heatmap Normalized By Per Experiment Intensity',
yaxis=dict(type="category"),
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
def plotClusters(comp1, comp2, comp3, treatments, clusters, title=""):
visrep = Scatter3d(
x=comp1,
y=comp2,
z=comp3,
text=treatments,
mode='markers',
marker=dict(
size=12,
color=clusters,
colorscale='Viridis',
opacity=0.8,
line = dict(
width = 0,
color = "black"
)
)
)
fig = Figure(data=[visrep], layout=Layout(title=title))
plotly.iplot(fig)
#PCA Decomposition to support later visualization
dim_red = PCA(n_components=10)
dim_red_genes = dim_red.fit_transform(norm_count_matrix)
print("% Variance Explained:", dim_red.explained_variance_ratio_.sum()*100)
plotClusters(dim_red_genes.T[0], dim_red_genes.T[1], dim_red_genes.T[2],
study_data.index, sample_readinfo["series"].astype("category").cat.rename_categories([0,1,2,3]),
"Experimental Groups on 3 PCs")
biclust = SpectralBiclustering(n_clusters=4)
biclust.fit(norm_count_matrix)
fit_data = np.asarray(norm_count_matrix)[np.argsort(biclust.row_labels_)]
fit_data = fit_data[:, np.argsort(biclust.column_labels_)]
heatmap = Data([
Heatmap(
y = study_data.index[np.argsort(biclust.row_labels_)],
x = study_data.columns[np.argsort(biclust.column_labels_)],
z = fit_data,
colorscale = 'Viridis'
)
])
layout = Layout(
title='Spectral CoClustering of Gene Expression Profiles',
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
#NMF Decomposition
nmf_decomp = NMF(n_components=4)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
Heatmap(
y = study_data.index,
z = np.asarray(nmf_genes),
colorscale = 'Viridis'
)
])
layout = Layout(
title='NMF Decomposition of Gene Expression Profiles',
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
def nmf_cluster(nmf_data, threshold):
nmf_data = pd.DataFrame(nmf_data)
return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
heatmap = Data([
Heatmap(
y = study_data.index,
z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),
colorscale = 'Viridis'
)
])
layout = Layout(
title='NMF Soft Clustering Assignments of Gene Expression Profiles',
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
#NMF Decomposition
nmf_decomp = NMF(n_components=5)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
Heatmap(
y = study_data.index,
z = np.asarray(nmf_genes),
colorscale = 'Viridis'
)
])
layout = Layout(
title='NMF Decomposition of Gene Expression Profiles',
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
def nmf_cluster(nmf_data, threshold):
nmf_data = pd.DataFrame(nmf_data)
return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
heatmap = Data([
Heatmap(
y = study_data.index,
z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),
colorscale = 'Viridis'
)
])
layout = Layout(
title='NMF Soft Clustering Assignments of Gene Expression Profiles',
margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
sns.distplot(np.std(norm_count_matrix, axis=0))
#Limit features
gene_filter = np.std(norm_count_matrix, axis=0) > .3
limited_study_data = study_data.loc[:, gene_filter]
limited_count_matrix = norm_count_matrix[:, gene_filter]
print(count_matrix.shape, limited_count_matrix.shape)
heatmap = Data([
Heatmap(
y = limited_study_data.columns.values,
x = limited_study_data.index,
z = limited_count_matrix.T,
colorscale='Viridis'
)
])
layout = Layout(
title='Limited Gene Expression Heatmap',
yaxis=dict(type="category")
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)